{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Manipulating Molecules and Proteins within OpenBioMed\n",
    "\n",
    "OpenBioMed develops specific data structures for handling molecules, pockets, proteins and other data modalities in chemistry and life science. We provide easy-to-use APIs for input loading, data manipulation and format transformation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/AIRvePFS/dair/luoyz-data/projects/OpenBioMed/OpenBioMed_arch\n"
     ]
    }
   ],
   "source": [
    "# Change working directory\n",
    "import os\n",
    "import sys\n",
    "parent = os.path.dirname(os.path.abspath(''))\n",
    "print(parent)\n",
    "sys.path.append(parent)\n",
    "os.chdir(parent)\n",
    "\n",
    "import logging\n",
    "logging.basicConfig(level=logging.ERROR)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we present several basic usage of the `Molecule` data structure."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "43 48\n",
      "[[ 7.15979818  0.80874434 -1.1777022 ]\n",
      " [ 6.03656999  0.98159063 -0.25523318]\n",
      " [ 6.5018441   1.18971222  1.11943242]\n",
      " [ 6.66523363 -0.20081618  1.6896087 ]\n",
      " [ 5.54799761 -1.00059728  1.0349241 ]\n",
      " [ 5.15551891 -0.20738398 -0.21709433]\n",
      " [ 3.67186942  0.23835869 -0.15975325]\n",
      " [ 2.79871947 -0.89883247 -0.09910995]\n",
      " [ 1.46873112 -0.51443636 -0.1461924 ]\n",
      " [ 1.15329933  0.75341864  0.16184495]\n",
      " [-0.15212908  1.07702788  0.11271499]\n",
      " [-1.13060623  0.13843592 -0.18651713]\n",
      " [-2.56195997  0.60224809 -0.26848812]\n",
      " [-2.61589085  1.98880147 -0.91233925]\n",
      " [-1.7825011   2.96467228 -0.19616682]\n",
      " [-0.53343712  2.50816882  0.45025218]\n",
      " [-2.50230208  3.96691016  0.47985061]\n",
      " [-3.13052426  3.64024452  1.69501868]\n",
      " [-3.93447426  4.55568201  2.36219254]\n",
      " [-4.12855294  5.818542    1.8206921 ]\n",
      " [-3.51034937  6.19212391  0.61558734]\n",
      " [-2.67001728  5.2823797  -0.0931741 ]\n",
      " [-2.08308779  5.79607309 -1.29704122]\n",
      " [-2.33174422  7.10892277 -1.74528179]\n",
      " [-3.17644012  7.95082858 -1.04553662]\n",
      " [-3.7554685   7.4915732   0.1285071 ]\n",
      " [-0.99163735  4.89717253 -2.30902942]\n",
      " [-0.70191467 -1.16676402 -0.45029655]\n",
      " [ 0.60055216 -1.49510271 -0.43956623]\n",
      " [-1.64679714 -2.12915293 -0.80618072]\n",
      " [-2.54437017 -2.56353793  0.27559944]\n",
      " [-2.08539457 -3.88347299  0.90232423]\n",
      " [-1.93167109 -4.97282649 -0.0844877 ]\n",
      " [-1.06227749 -4.61380138 -1.24475574]\n",
      " [-1.29295189 -3.15979488 -1.78961318]\n",
      " [-1.27938582 -5.56258492 -2.45360204]\n",
      " [-0.2174369  -5.40707465 -3.46087308]\n",
      " [ 0.63034382 -5.28808772 -4.24613934]\n",
      " [-1.77542539 -6.27450847  0.44844653]\n",
      " [-0.96085988 -7.08578706  0.01928373]\n",
      " [-2.64270006 -6.63396037  1.60393026]\n",
      " [-2.26090304 -7.45610279  2.59209198]\n",
      " [-3.86976173 -6.05818107  1.64092191]\n",
      " [ 6.79873013  0.55622683 -2.18555309]\n",
      " [ 7.73818536  1.7383868  -1.26932641]\n",
      " [ 7.86033797  0.03522178 -0.85261627]\n",
      " [ 5.73395043  1.73001106  1.67486844]\n",
      " [ 7.419509    1.78988913  1.19503237]\n",
      " [ 6.5962572  -0.22311772  2.77960104]\n",
      " [ 7.64226268 -0.61823704  1.40514846]\n",
      " [ 5.92360057 -2.00082331  0.76505514]\n",
      " [ 4.71342704 -1.145578    1.74648416]\n",
      " [ 5.29425385 -0.84813919 -1.10409702]\n",
      " [ 3.51786447  0.89570764  0.71191176]\n",
      " [ 3.44634771  0.8119402  -1.07047723]\n",
      " [-3.17026893 -0.07509033 -0.88742447]\n",
      " [-2.98709825  0.62720276  0.74431733]\n",
      " [-3.66005573  2.31527798 -0.97518097]\n",
      " [-2.24671867  1.91806773 -1.94550768]\n",
      " [-0.61223973  2.56657124  1.54657908]\n",
      " [ 0.28547577  3.17511762  0.14086687]\n",
      " [-3.01851963  2.64286832  2.11477936]\n",
      " [-4.4364677   4.27770288  3.28439649]\n",
      " [-4.78644737  6.51347537  2.34814639]\n",
      " [-1.86585325  7.48365619 -2.66101006]\n",
      " [-3.36810664  8.96373699 -1.39865854]\n",
      " [-4.41311635  8.16879901  0.66790813]\n",
      " [-3.56237454 -2.66852699 -0.14799078]\n",
      " [-2.61952358 -1.8139944   1.07031525]\n",
      " [-2.81165169 -4.1515399   1.68998666]\n",
      " [-1.10541696 -3.76139814  1.40803808]\n",
      " [-0.03124457 -4.68098514 -0.87226339]\n",
      " [-2.14762764 -3.18451022 -2.48573045]\n",
      " [-0.43654361 -2.83943951 -2.40281023]\n",
      " [-2.26024722 -5.34812236 -2.90580875]\n",
      " [-1.32754918 -6.62104859 -2.18439887]\n",
      " [-1.31476619 -7.99053664  2.5542106 ]\n",
      " [-2.91587013 -7.61759684  3.43215917]]\n",
      "./tmp/molecule_with_conformation.sdf\n"
     ]
    }
   ],
   "source": [
    "# Manipulating molecules\n",
    "from open_biomed.data.molecule import Molecule\n",
    "\n",
    "# Initialize a molecule with a SMILES string\n",
    "molecule = Molecule.from_smiles(\"CN1CCC[C@H]1COC2=NC3=C(CCN(C3)C4=CC=CC5=C4C(=CC=C5)Cl)C(=N2)N6CCN([C@H](C6)CC#N)C(=O)C(=C)F\")\n",
    "# Add a RDKit molecule object\n",
    "molecule._add_rdmol()\n",
    "# Then, by calling molecule.rdmol, you can get or modify information with RDKit\n",
    "print(molecule.rdmol.GetNumAtoms(), molecule.rdmol.GetNumBonds())\n",
    "# Add a 3D conformer\n",
    "molecule._add_conformer(mode='3D')\n",
    "# You can directly obtain an ndarray of 3D coordinates, which is synchronized in molecule.rdmol\n",
    "print(molecule.conformer)\n",
    "# Save the molecule as a .sdf file\n",
    "print(molecule.save_sdf(\"./tmp/molecule_with_conformation.sdf\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "C=C(F)C(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN(c2cccc4cccc(Cl)c24)C3)C[C@@H]1CC#N True\n",
      "C=CC(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c(F)c(-c4nc(N)cc(C)c4C(F)(F)F)c(Cl)cc23)[C@@H](C)C1 ./tmp/pubchem_146624881.pkl\n",
      "0.31693989071038253\n",
      "C=CC(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN(c2cccc4cccc(Cl)c24)C3)C[C@@H]1CC#N ./tmp/pubchem_134325731.pkl\n",
      "0.8769230769230769\n",
      "C=CC(=O)N1CCN(c2nc(OC[C@@H]3CCCN3C)nc3c2CCN(c2cccc4cccc(C)c24)C3)C[C@@H]1CC#N ./tmp/pubchem_134326084.pkl\n",
      "0.7941176470588235\n",
      "CN1CCC[C@H]1COc1nc(N2CCNCC2)c2cnc(-c3cccc4cccc(Cl)c34)c(F)c2n1 ./tmp/pubchem_155233433.pkl\n",
      "0.46153846153846156\n"
     ]
    }
   ],
   "source": [
    "import asyncio\n",
    "from open_biomed.core.web_request import PubChemRequester, PubChemStructureRequester\n",
    "from open_biomed.data.molecule import check_identical_molecules, molecule_fingerprint_similarity\n",
    "\n",
    "# We provide APIs for obtaining molecules from PubChem via molecule name or ID\n",
    "requester = PubChemRequester(db_url=\"https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/{accession}/SDF\")\n",
    "# Web request is performed via asynchronous calls\n",
    "pubchem_molecule = (await asyncio.create_task(requester.run(\"Adagrasib\")))[0][0]\n",
    "# Check if Adagrasib is the same molecule as the one we construct earlier\n",
    "print(pubchem_molecule, check_identical_molecules(molecule, pubchem_molecule))\n",
    "\n",
    "# We also provide APIs for searching struturally similar molecules on PubChem\n",
    "requester = PubChemStructureRequester()\n",
    "similar_molecules = (await asyncio.create_task(requester.run(\n",
    "    molecule=pubchem_molecule,    # query molecule\n",
    "    threshold=0.8,                # structural similarity threshold\n",
    "    max_records=5,                # maximum number of molecules\n",
    ")))\n",
    "# The first molecule is the requested molecule itself (identical)\n",
    "for i in range(1, 5):\n",
    "    # Check the SMILES and saved files of similar molecules\n",
    "    print(similar_molecules[0][i], similar_molecules[1][i])\n",
    "    # Let's check the Morgan fingerprint similarity of returned molecule!\n",
    "    # NOTE: the threshold value used in Pubchem is not Morgan fingerprint similarity, so the output scores could be lower than 0.8\n",
    "    print(molecule_fingerprint_similarity(pubchem_molecule, similar_molecules[0][i], fingerprint_type=\"morgan\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Here we present some basic usage of the `Protein` data structure"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.8164251207729468\n",
      "MTEYKLVVVGA-GGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNT-KSFEDIHHYREQIKRVKDSEDVPMVLVGNKCD-LPSRTVDTKQAQDLARSYGIPFIETSAKTRQR-VED-AFYTLVREIRQYRLK-KISKEEKTP----GCVKIKK------C-IIM-\n",
      "MTEYKLVVVGAVG-VGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINN-IKSFEDIHHYREQIKRVKDSEDVPMVLVGNKC-ALPSRTVDTKQAQDLARSYGIPFIETSAKTRQ-GV-DDAFYTLVREIR----KHK---E-K--MSKDG--K-KKKKSKTKCSI--L\n"
     ]
    }
   ],
   "source": [
    "# Manipulating proteins\n",
    "from open_biomed.data.protein import Protein, protein_sequence_similarity\n",
    "\n",
    "# Initalize the protein object with its amino acid sequence\n",
    "protein1 = Protein.from_fasta(\"MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQRVEDAFYTLVREIRQYRLKKISKEEKTPGCVKIKKCIIM\")\n",
    "protein2 = Protein.from_fasta(\"MTEYKLVVVGAVGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNIKSFEDIHHYREQIKRVKDSEDVPMVLVGNKCALPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKHKEKMSKDGKKKKKSKTKCSIL\")\n",
    "# Protein sequence alignment\n",
    "outputs = protein_sequence_similarity(protein1, protein2)\n",
    "print(outputs[0])\n",
    "print(outputs[1])\n",
    "print(outputs[2])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKSDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKH\n",
      "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVIDGETCLLDILDTAGQEEYSAMRDQYMRTGEGFLCVFAINNTKSFEDIHHYREQIKRVKDSEDVPMVLVGNKSDLPSRTVDTKQAQDLARSYGIPFIETSAKTRQGVDDAFYTLVREIRKH\n",
      "./tmp/msa_results_1741148484988/uniref.a3m\n",
      "./tmp/foldseek_results_1741148485003\n"
     ]
    }
   ],
   "source": [
    "import asyncio\n",
    "from open_biomed.core.web_request import PDBRequester, MSARequester, FoldSeekRequester\n",
    "\n",
    "# We provide APIs for obtaining proteins on PDB, AlphafoldDB and UniProt\n",
    "requester = PDBRequester()\n",
    "# Web request is performed via asynchronous calls\n",
    "protein_with_structure = (await asyncio.create_task(requester.run(\"4EPT\")))[0][0]\n",
    "print(protein_with_structure)\n",
    "# If the protein object has 3D structure, you can save it as a pdb file\n",
    "file = protein_with_structure.save_pdb()\n",
    "# We also provide APIs for loading a protein from a pdb file\n",
    "loaded_protein = Protein.from_pdb_file(file)\n",
    "print(loaded_protein)\n",
    "\n",
    "# We also provide APIs for performing MSA search and FoldSeek with web services\n",
    "requester = MSARequester()\n",
    "# NOTE: Performing MSA may take a long time and may occasionally fail\n",
    "outputs = (await asyncio.create_task(requester.run(loaded_protein)))[0][0]\n",
    "# Check the .a3m file for MSA results!\n",
    "print(outputs)\n",
    "\n",
    "requester = FoldSeekRequester(database=[\"afdb50\"], timeout=60)\n",
    "# NOTE: Performing FoldSeek may take a long time and may occasionally fail\n",
    "outputs = (await asyncio.create_task(requester.run(loaded_protein)))[0][0]\n",
    "# Check the folder for FoldSeek results!\n",
    "print(outputs)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.9.7 ('biomed')",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.7"
  },
  "orig_nbformat": 4,
  "vscode": {
   "interpreter": {
    "hash": "2b5492c31ef84abdc69aadb95e4c210f44c226a5800d1d766b22f7a50017392c"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
